Restarted abm_dev (Python 3.9.19)

In [ ]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (
    r2_score,
    mean_squared_error,
    accuracy_score,
    confusion_matrix,
    classification_report,
)
from sklearn import linear_model
from sklearn import tree
from sklearn.cluster import KMeans

from pumf_2016_data import pumf_2016_util

# StandardScaler preferred over Normalizer, see note 1
# note 1: https://datascience.stackexchange.com/questions/45900/when-to-use-standard-scaler-and-when-normalizer
In [ ]:
df = pumf_2016_util.read_indiv_table()
pumf_2016_util.recode_indiv_table(df)

# list columns
print(list(df.columns))

df_2016 = df.copy()
del df
{'Female': 0, 'Male': 1}
{'No bedroom': 0.0, '1 bedroom': 1.0, '2 bedrooms': 2.0, '3 bedrooms': 3.0, '4 bedrooms': 4.0, '5 bedrooms or more': 5.0, 'Not available': 6.0}
['PPSORT', 'WEIGHT', 'WT1', 'WT2', 'WT3', 'WT4', 'WT5', 'WT6', 'WT7', 'WT8', 'WT9', 'WT10', 'WT11', 'WT12', 'WT13', 'WT14', 'WT15', 'WT16', 'ABOID', 'AGEGRP', 'AGEIMM', 'ATTSCH', 'BedRm', 'BFNMEMB', 'CapGn', 'CFInc', 'CFInc_AT', 'CfSize', 'CFSTAT', 'CHDBN', 'ChldC', 'CIP2011', 'CIP2011_STEM_SUM', 'Citizen', 'CitOth', 'CMA', 'CONDO', 'COW', 'CQPPB', 'DETH123', 'DIST', 'DPGRSUM', 'DTYPE', 'EFDecile', 'EfDIMBM', 'EFInc', 'EFInc_AT', 'EfSize', 'EICBN', 'EmpIn', 'ETHDER', 'FOL', 'FPTWK', 'GENSTAT', 'GovtI', 'GTRfs', 'HCORENEED_IND', 'HDGREE', 'HHInc', 'HHInc_AT', 'HHMRKINC', 'HHSIZE', 'HHTYPE', 'HLAEN', 'HLAFR', 'HLANO', 'HLBEN', 'HLBFR', 'HLBNO', 'IMMCAT5', 'IMMSTAT', 'IncTax', 'Invst', 'KOL', 'LFACT', 'LICO', 'LICO_AT', 'LOC_ST_RES', 'LOCSTUD', 'LoLIMA', 'LoLIMB', 'LoMBM', 'LSTWRK', 'LWAEN', 'LWAFR', 'LWANO', 'LWBEN', 'LWBFR', 'LWBNO', 'MarStH', 'MOB1', 'Mob5', 'MODE', 'MrkInc', 'MTNEn', 'MTNFr', 'MTNNO', 'NAICS', 'NOC16', 'NOCS', 'NOL', 'NOS', 'OASGI', 'OtInc', 'PKID0_1', 'PKID15_24', 'PKID2_5', 'PKID25', 'PKID6_14', 'PKIDS', 'POB', 'POBF', 'POBM', 'POWST', 'PR', 'PR1', 'PR5', 'PresMortG', 'PRIHM', 'PWDUR', 'PWLEAVE', 'PWOCC', 'PWPR', 'REGIND', 'REPAIR', 'Retir', 'ROOMS', 'SempI', 'Sex', 'SHELCO', 'SSGRAD', 'Subsidy', 'Tenur', 'TotInc', 'TotInc_AT', 'VALUE', 'VisMin', 'Wages', 'WKSWRK', 'WRKACT', 'YRIMM', 'AGE', 'AGEBIN', 'MOB1_cat', 'MOB5_cat', 'HHSIZE_int', 'NOCS_cat', 'NOCS_cat_combined', 'Sex_cat', 'BedRm_cat']
In [ ]:
# # %%

# preloaded_fea_file = "data/census_pumf_vancouver_2021.fea"
# data_path = "../data_packages/PUMF_Census_2021_StatCan/ind/data_donnees_2021_ind_v2.csv"


# if not os.path.exists(preloaded_fea_file):
#     df = pd.read_csv(data_path)
#     # import pyreadstat
#     # df_2021, meta = pyreadstat.read_dta(data_path)
#     # filter for CMA equals to Vancouver
#     # source: https://www23.statcan.gc.ca/imdb/p3VD.pl?Function=getVD&TVD=317043&CVD=317046&CPV=59A&CST=01012016&CLV=3&MLV=5
#     # df = df[df["CMA"] == "Vancouver"].reset_index(
#     #     drop=True
#     # )  # filter by Vancouver CMA only
#     # save to feather to allow faster loading later
#     df.to_feather(preloaded_fea_file)
# else:
#     df = pd.read_feather(preloaded_fea_file)

# # list columns
# print(list(df.columns))

# df_2021 = df.copy()
# del df
In [ ]:
df = df_2016
In [ ]:
# population by age

plot_df = pd.pivot_table(
    df, values="WEIGHT", index="AGEBIN", columns=None, aggfunc="sum"
).reset_index()

plt.figure(figsize=(12, 6))
chart = sns.barplot(data=plot_df, x="AGEBIN", y="WEIGHT", palette="Paired")
chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
plt.show()
<ipython-input-5-4cc62d9ae302>:10: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  chart = sns.barplot(data=plot_df, x="AGEBIN", y="WEIGHT", palette="Paired")
<ipython-input-5-4cc62d9ae302>:11: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
No description has been provided for this image
In [ ]:
# population of age by household size

plt.figure(figsize=(8, 7))
plot_df = pd.pivot_table(
    df, values="WEIGHT", index="AGEBIN", columns="HHSIZE", aggfunc="sum"
)
sns.heatmap(plot_df, annot=False, cmap="Blues")  # cmap="flare")
Out[ ]:
<Axes: xlabel='HHSIZE', ylabel='AGEBIN'>
No description has been provided for this image
In [ ]:
# population of age by household size, Male

plt.figure(figsize=(8, 7))
plot_df = pd.pivot_table(
    df[df["Sex"] == "Male"],
    values="WEIGHT",
    index="AGEBIN",
    columns="HHSIZE",
    aggfunc="sum",
)
sns.heatmap(plot_df, annot=False, cmap="Greens")  # cmap="flare")
Out[ ]:
<Axes: xlabel='HHSIZE', ylabel='AGEBIN'>
No description has been provided for this image
In [ ]:
# population of age by household size, Female

plt.figure(figsize=(8, 7))
plot_df = pd.pivot_table(
    df[df["Sex"] == "Female"],
    values="WEIGHT",
    index="AGEBIN",
    columns="HHSIZE",
    aggfunc="sum",
)
sns.heatmap(plot_df, annot=False, cmap="Purples")  # cmap="flare")
Out[ ]:
<Axes: xlabel='HHSIZE', ylabel='AGEBIN'>
No description has been provided for this image
In [ ]:
# mobility status (moving) 5 year by household size

plot_df = pd.pivot_table(
    df, values="WEIGHT", index="Mob5", columns="HHSIZE", aggfunc="sum"
)
sns.heatmap(plot_df, annot=False, cmap="Blues")  # cmap="flare")
Out[ ]:
<Axes: xlabel='HHSIZE', ylabel='Mob5'>
No description has been provided for this image
In [ ]:
# mobility status (moving) 5 year by household size, exclude non movers

plot_df = pd.pivot_table(
    df[~df["Mob5"].isin(["Non-migrants", "Non-movers", "Not applicable"])],
    values="WEIGHT",
    index="Mob5",
    columns="HHSIZE",
    aggfunc="sum",
    observed=True,
)
sns.heatmap(plot_df, annot=False, cmap="Blues")  # cmap="flare")
Out[ ]:
<Axes: xlabel='HHSIZE', ylabel='Mob5'>
No description has been provided for this image
In [ ]:
# total income by household size

df_processed = df[(df["EmpIn"] < 88888888)]
bins = list(range(0, 175000, 25000))
df_processed["EmpIn_cat"] = pd.cut(df_processed["EmpIn"], bins)
plot_df = pd.pivot_table(
    df_processed,
    values="WEIGHT",
    index="HHSIZE",
    columns="EmpIn_cat",
    aggfunc="sum",
    observed=True,
)
sns.heatmap(plot_df, annot=False, cmap="Blues")  # cmap="flare")
<ipython-input-11-f7b35357bf58>:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_processed["EmpIn_cat"] = pd.cut(df_processed["EmpIn"], bins)
Out[ ]:
<Axes: xlabel='EmpIn_cat', ylabel='HHSIZE'>
No description has been provided for this image
In [ ]:
# total income by household size, with children only

df_processed = df[(df["EmpIn"] < 88888888)]
df_processed = df_processed[(df_processed["PKIDS"] == "One or more")]
bins = list(range(0, 175000, 25000))
df_processed["EmpIn_cat"] = pd.cut(df_processed["EmpIn"], bins)
plot_df = pd.pivot_table(
    df_processed,
    values="WEIGHT",
    index="HHSIZE",
    columns="EmpIn_cat",
    aggfunc="sum",
    observed=True,
)
sns.heatmap(plot_df, annot=False, cmap="Blues")  # cmap="flare")
Out[ ]:
<Axes: xlabel='EmpIn_cat', ylabel='HHSIZE'>
No description has been provided for this image
In [ ]:
# employment income by hours of work, across sex

plt.figure(figsize=(20, 40))

df_processed = df[(df["EmpIn"] < 88888888)]
bins = list(range(0, 175000, 25000))
df_processed["EmpIn_cat"] = pd.cut(df_processed["EmpIn"], bins)
df_processed2 = df_processed[["Sex", "WKSWRK", "EmpIn_cat", "WEIGHT"]].reset_index()
plot_df = pd.pivot_table(
    df_processed,
    values="WEIGHT",
    index=["Sex", "WKSWRK", "EmpIn_cat"],
    columns=None,
    aggfunc="sum",
    observed=True,
).reset_index()
# sns.heatmap(plot_df, annot=False, cmap="Blues")  # cmap="flare")

fg = sns.FacetGrid(plot_df, col="Sex", col_wrap=3, height=6, aspect=0.75)


def draw_heatmap(*args, **kwargs):
    data = kwargs.pop("data")
    d = data.pivot(index=args[1], columns=args[0], values=args[2])
    sns.heatmap(d, **kwargs)


fg.map_dataframe(
    draw_heatmap,
    "EmpIn_cat",
    "WKSWRK",
    "WEIGHT",
    cmap="Blues",
    cbar=False,
    square=False,
)
# fg.set(yticks=[])
# fg.set(xticks=[])
plt.show()
<ipython-input-13-3c371c939ffb>:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_processed["EmpIn_cat"] = pd.cut(df_processed["EmpIn"], bins)
<Figure size 2000x4000 with 0 Axes>
No description has been provided for this image
In [ ]:
# employment income, by work activity, across sex

plt.figure(figsize=(20, 40))

df_processed = df[(df["EmpIn"] < 88888888)]
bins = list(range(0, 175000, 25000))
df_processed["EmpIn_cat"] = pd.cut(df_processed["EmpIn"], bins)
df_processed2 = df_processed[["Sex", "WRKACT", "EmpIn_cat", "WEIGHT"]].reset_index()
plot_df = pd.pivot_table(
    df_processed,
    values="WEIGHT",
    index=["Sex", "WRKACT", "EmpIn_cat"],
    columns=None,
    aggfunc="sum",
    observed=True,
).reset_index()
# sns.heatmap(plot_df, annot=False, cmap="Blues")  # cmap="flare")

fg = sns.FacetGrid(plot_df, col="Sex", col_wrap=3, height=6, aspect=0.6)


def draw_heatmap(*args, **kwargs):
    data = kwargs.pop("data")
    d = data.pivot(index=args[1], columns=args[0], values=args[2])
    sns.heatmap(d, **kwargs)


fg.map_dataframe(
    draw_heatmap,
    "EmpIn_cat",
    "WRKACT",
    "WEIGHT",
    cmap="Blues",
    cbar=False,
    square=False,
)
# fg.set(yticks=[])
# fg.set(xticks=[])
plt.show()
<ipython-input-14-b3dc6934e354>:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_processed["EmpIn_cat"] = pd.cut(df_processed["EmpIn"], bins)
<Figure size 2000x4000 with 0 Axes>
No description has been provided for this image
In [ ]:
# employment income, by employment occupations, across sex

plt.figure(figsize=(20, 40))

df_processed = df[(df["EmpIn"] < 88888888)]
bins = list(range(0, 175000, 25000))
df_processed["EmpIn_cat"] = pd.cut(df_processed["EmpIn"], bins)
df_processed2 = df_processed[["Sex", "NOCS", "EmpIn_cat", "WEIGHT"]].reset_index()
plot_df = pd.pivot_table(
    df_processed,
    values="WEIGHT",
    index=["Sex", "NOCS", "EmpIn_cat"],
    columns=None,
    aggfunc="sum",
    observed=True,
).reset_index()
# sns.heatmap(plot_df, annot=False, cmap="Blues")  # cmap="flare")

fg = sns.FacetGrid(plot_df, col="Sex", col_wrap=3, height=6, aspect=0.8)


def draw_heatmap(*args, **kwargs):
    data = kwargs.pop("data")
    d = data.pivot(index=args[1], columns=args[0], values=args[2])
    sns.heatmap(d, **kwargs)


fg.map_dataframe(
    draw_heatmap,
    "EmpIn_cat",
    "NOCS",
    "WEIGHT",
    cmap="Blues",
    cbar=False,
    square=False,
)
# fg.set(yticks=[])
# fg.set(xticks=[])
plt.show()
<ipython-input-15-4732fe8487ea>:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_processed["EmpIn_cat"] = pd.cut(df_processed["EmpIn"], bins)
<Figure size 2000x4000 with 0 Axes>
No description has been provided for this image
In [ ]:
# employment status across gender
df_processed = df.copy()
df_processed = df_processed[(df_processed["NOCS_cat"] != "Not")]
df_processed = df_processed[(df_processed["EmpIn"] < 88888888)]
df_processed = df_processed[(df_processed["EmpIn"] != 1)]
df_processed = df_processed[(df_processed["EmpIn"] != -1)]
df_processed = df_processed[(df_processed["EmpIn"] != 0)]
df_processed = df_processed[(df_processed["EmpIn"] > 0)]
df_processed = df_processed[(df_processed["EmpIn"] < 250000)]
df_processed = df_processed.sort_values(["NOCS_cat", "AGEBIN"], ascending=True)

plt.figure(figsize=(10, 5))
chart = sns.violinplot(
    data=df_processed,
    x="NOCS",
    y="EmpIn",
    hue="Sex",
    hue_order=["Male", "Female"],
    split=True,
    inner="quart",
    cut=0,
    palette=sns.color_palette(),
)
chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
plt.show()
<ipython-input-16-2fb64a6dc72b>:15: UserWarning: The palette list has more values (10) than needed (2), which may not be intended.
  chart = sns.violinplot(
<ipython-input-16-2fb64a6dc72b>:26: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
No description has been provided for this image
In [ ]:
plt.figure(figsize=(10, 5))
chart = sns.lineplot(
    x="NOCS", y="EmpIn", hue="Sex", data=df_processed, errorbar=("ci", 95)
)
chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
plt.show()
<ipython-input-17-21eff3bcfbf8>:7: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
No description has been provided for this image
In [ ]:
plt.figure(figsize=(10, 5))
chart = sns.violinplot(
    data=df_processed,
    x="WKSWRK",
    y="EmpIn",
    hue="Sex",
    hue_order=["Male", "Female"],
    split=True,
    inner="quart",
    cut=0,
    palette=sns.color_palette(),
)
chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
plt.show()
<ipython-input-18-89ab36c3c581>:4: UserWarning: The palette list has more values (10) than needed (2), which may not be intended.
  chart = sns.violinplot(
<ipython-input-18-89ab36c3c581>:15: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
No description has been provided for this image
In [ ]:
plt.figure(figsize=(10, 5))
chart = sns.violinplot(
    data=df_processed,
    x="WRKACT",
    y="EmpIn",
    hue="Sex",
    hue_order=["Male", "Female"],
    split=True,
    inner="quart",
    cut=0,
    palette=sns.color_palette(),
)
chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
plt.show()
<ipython-input-19-a371957deac1>:4: UserWarning: The palette list has more values (10) than needed (2), which may not be intended.
  chart = sns.violinplot(
<ipython-input-19-a371957deac1>:15: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
No description has been provided for this image
In [ ]:
plt.figure(figsize=(10, 5))
chart = sns.violinplot(
    data=df_processed[df_processed["HHSIZE"] != "Not available"],
    x="HHSIZE",
    y="EmpIn",
    hue="Sex",
    hue_order=["Male", "Female"],
    split=True,
    inner="quart",
    cut=0,
    palette=sns.color_palette(),
)
chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
plt.show()
<ipython-input-20-4a8355507187>:4: UserWarning: The palette list has more values (10) than needed (2), which may not be intended.
  chart = sns.violinplot(
<ipython-input-20-4a8355507187>:15: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
No description has been provided for this image
In [ ]:
plt.figure(figsize=(10, 5))

chart = sns.lineplot(
    x="HHSIZE",
    y="EmpIn",
    hue="Sex",
    data=df_processed[df_processed["HHSIZE"] != "Not available"],  # style="event",
)
chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
plt.show()
<ipython-input-21-c28e057a1d4e>:11: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
No description has been provided for this image
In [ ]:
df_processed2 = df_processed[df_processed["AGE"] != -1]
df_processed2["AGEBIN"] = df_processed2["AGEBIN"].astype(str)
df_processed2 = df_processed2.sort_values(["AGEBIN"], ascending=True)
plt.figure(figsize=(10, 5))
chart = sns.violinplot(
    data=df_processed2,
    x="AGEBIN",
    y="EmpIn",
    hue="Sex",
    hue_order=["Male", "Female"],
    split=True,
    inner="quart",
    cut=0,
    palette=sns.color_palette(),
)
chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
plt.show()
<ipython-input-22-092f4026b097>:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_processed2["AGEBIN"] = df_processed2["AGEBIN"].astype(str)
<ipython-input-22-092f4026b097>:7: UserWarning: The palette list has more values (10) than needed (2), which may not be intended.
  chart = sns.violinplot(
<ipython-input-22-092f4026b097>:18: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
No description has been provided for this image
In [ ]:
plt.figure(figsize=(10, 5))
chart = sns.lineplot(
    x="AGEBIN", y="EmpIn", hue="Sex", data=df_processed2, errorbar=("ci", 95)
)
chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
plt.show()
<ipython-input-23-2a14bc8ed973>:7: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  chart.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
No description has been provided for this image
In [ ]:
# Household size and age

df_processed3 = df_processed2[~(df_processed2["HHSIZE"] == "Not available")]
df_processed3["HHSIZE4"] = df_processed3["HHSIZE"].astype(str)
df_processed3["HHSIZE4"] = df_processed3["HHSIZE4"].map(
    {
        "1 person": "1 person",
        "2 persons": "2 persons",
        "3 persons": "3 persons",
        "4 persons": "4+ persons",
        "5 persons": "4+ persons",
        "6 persons": "4+ persons",
        "7 persons": "4+ persons",
    }
)
df_processed3 = df_processed3.sort_values(["HHSIZE4", "AGEBIN"], ascending=True)

g = sns.relplot(
    data=df_processed3,
    x="AGEBIN",
    y="EmpIn",
    col="HHSIZE4",
    hue="Sex",
    hue_order=["Male", "Female"],
    # style="event",
    kind="line",
    err_style="band",
    col_wrap=4,
    height=6,
    aspect=0.8,
    # palette="Set2",
    errorbar=("ci", 95),
)
g.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
g.add_legend()
<ipython-input-24-b00bf8fe8ab3>:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_processed3["HHSIZE4"] = df_processed3["HHSIZE"].astype(str)
<ipython-input-24-b00bf8fe8ab3>:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_processed3["HHSIZE4"] = df_processed3["HHSIZE4"].map(
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x1d7f0ab7b80>
No description has been provided for this image
In [ ]:
# full time worker income by gender, across different number of weeks
df_processed3 = df_processed2[
    (
        ~df_processed2["WRKACT"].isin(
            [
                "Not available",
                "Not applicable",
                "Didn't work in 2015, worked in 2016",
                "Woked before 2015 or never worked",
            ]
        )
    )
    & (df_processed2["WRKACT"].str.contains("full time"))
]
df_processed3["WRKACT"] = (
    df_processed3["WRKACT"]
    .astype(str)
    .replace({"Worked 1 to 13 weeks full time": "Worked 01 to 13 weeks full time"})
)
df_processed3 = df_processed3.sort_values(["WRKACT", "AGEBIN"], ascending=True)
g = sns.relplot(
    data=df_processed3,
    x="AGEBIN",
    y="EmpIn",
    col="WRKACT",
    # col="HHSIZE",
    hue="Sex",
    hue_order=["Male", "Female"],
    # style="event",
    kind="line",
    err_style="band",
    col_wrap=5,
    height=6,
    aspect=0.8,
    # palette="Set2",
    errorbar=("ci", 95),
)
g.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
g.add_legend()
<ipython-input-25-660c8cd086a1>:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_processed3["WRKACT"] = (
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x1d7e65bbcd0>
No description has been provided for this image
In [ ]:
# part time worker income by gender, across different number of weeks
df_processed3 = df_processed2[
    (
        ~df_processed2["WRKACT"].isin(
            [
                "Not available",
                "Not applicable",
                "Didn't work in 2015, worked in 2016",
                "Woked before 2015 or never worked",
            ]
        )
    )
    & (df_processed2["WRKACT"].str.contains("part time"))
]
df_processed3["WRKACT"] = (
    df_processed3["WRKACT"]
    .astype(str)
    .replace({"Worked 1 to 13 weeks full time": "Worked 01 to 13 weeks full time"})
)
df_processed3 = df_processed3.sort_values(["WRKACT", "AGEBIN"], ascending=True)
g = sns.relplot(
    data=df_processed3,
    x="AGEBIN",
    y="EmpIn",
    col="WRKACT",
    # col="HHSIZE",
    hue="Sex",
    hue_order=["Male", "Female"],
    # style="event",
    kind="line",
    err_style="band",
    col_wrap=5,
    height=6,
    aspect=0.8,
    # palette="Set2",
    errorbar=("ci", 95),
)
g.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
g.add_legend()
<ipython-input-26-99aba2d0cb4b>:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_processed3["WRKACT"] = (
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x1d7e61f90d0>
No description has been provided for this image
In [ ]:
# Weeks worked across occupation – All workers
df_processed2 = df_processed2.sort_values(["NOCS_cat", "AGEBIN"], ascending=True)

g = sns.relplot(
    data=df_processed2,
    x="AGEBIN",
    y="EmpIn",
    col="NOCS_cat",
    hue="Sex",
    hue_order=["Male", "Female"],
    # style="event",
    kind="line",
    col_wrap=5,
    height=6,
    aspect=0.8,
    # palette="Set2",
    errorbar=("ci", 95),
)
g.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
g.add_legend()
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x1d7f2cb3160>
No description has been provided for this image
In [ ]:
# Weeks worked across occupation – Full time Full year
# df_processed3 = df_processed2[df_processed2["WRKACT"] != "Worked 49 to 52 weeks full time"]
df_processed3 = df_processed2.copy()
df_processed3 = df_processed3[
    (
        df_processed3["WRKACT"].isin(
            ["Worked 40 to 48 weeks full time", "Worked 49 to 52 weeks full time"]
        )
    )
]

g = sns.relplot(
    data=df_processed3,
    x="AGEBIN",
    y="EmpIn",
    col="NOCS_cat",
    hue="Sex",
    hue_order=["Male", "Female"],
    # style="event",
    kind="line",
    col_wrap=5,
    height=6,
    aspect=0.8,
    # palette="Set2",
    errorbar=("ci", 95),
)
g.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
g.add_legend()
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x1d7f0cd7b80>
No description has been provided for this image
In [ ]:
# Weeks worked across occupation – Part time or Part year
# df_processed3 = df_processed2[df_processed2["WRKACT"] != "Worked 49 to 52 weeks full time"]
df_processed3 = df_processed2.copy()
df_processed3 = df_processed3[
    ~(
        df_processed3["WRKACT"].isin(
            ["Worked 40 to 48 weeks full time", "Worked 49 to 52 weeks full time"]
        )
    )
]

g = sns.relplot(
    data=df_processed3,
    x="AGEBIN",
    y="EmpIn",
    col="NOCS_cat",
    hue="Sex",
    hue_order=["Male", "Female"],
    # style="event",
    kind="line",
    col_wrap=5,
    height=6,
    aspect=0.8,
    # palette="Set2",
    errorbar=("ci", 95),
)
g.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
g.add_legend()
Out[ ]:
<seaborn.axisgrid.FacetGrid at 0x1d7f08c42b0>
No description has been provided for this image
In [ ]:
# Weeks worked across occupation – Part time or Part year - across weeks worked
# df_processed3 = df_processed2[df_processed2["WRKACT"] != "Worked 49 to 52 weeks full time"]
df_processed3 = df_processed2.copy()
df_processed3 = df_processed3[
    ~(
        df_processed3["WRKACT"].isin(
            ["Worked 40 to 48 weeks full time", "Worked 49 to 52 weeks full time"]
        )
    )
]
df_processed3["WRKACT"] = df_processed3["WRKACT"].astype(str)

g = sns.relplot(
    data=df_processed3,
    x="NOCS_cat",
    y="EmpIn",
    col="WRKACT",
    hue="Sex",
    hue_order=["Male", "Female"],
    # style="event",
    kind="line",
    col_wrap=5,
    height=6,
    aspect=0.8,
    # palette="Set2",
    errorbar=("ci", 95),
)
g.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
g.add_legend()
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
File c:\Users\bwen\OneDrive - TransLink\PROJECTS\census-pumf\pumf_analysis\analysis.py:29
     12 df_processed3["WRKACT"] = df_processed3["WRKACT"].astype(str)
     14 g = sns.relplot(
     15     data=df_processed3,
     16     x="NOCS_cat",
   (...)
     27     errorbar=("ci", 95),
     28 )
---> 29 g.set_xticklabels(chart.get_xticklabels(), rotation=45, horizontalalignment="right")
     30 g.add_legend()

File c:\Users\bwen\.conda\envs\abm_dev\lib\site-packages\seaborn\axisgrid.py:929, in FacetGrid.set_xticklabels(self, labels, step, **kwargs)
    927         ax.set_xticklabels(curr_labels, **kwargs)
    928     else:
--> 929         ax.set_xticklabels(labels, **kwargs)
    930 return self

File c:\Users\bwen\.conda\envs\abm_dev\lib\site-packages\matplotlib\axes\_base.py:74, in _axis_method_wrapper.__set_name__.<locals>.wrapper(self, *args, **kwargs)
     73 def wrapper(self, *args, **kwargs):
---> 74     return get_method(self)(*args, **kwargs)

File c:\Users\bwen\.conda\envs\abm_dev\lib\site-packages\matplotlib\axis.py:2062, in Axis.set_ticklabels(self, labels, minor, fontdict, **kwargs)
   2058 elif isinstance(locator, mticker.FixedLocator):
   2059     # Passing [] as a list of labels is often used as a way to
   2060     # remove all tick labels, so only error for > 0 labels
   2061     if len(locator.locs) != len(labels) and len(labels) != 0:
-> 2062         raise ValueError(
   2063             "The number of FixedLocator locations"
   2064             f" ({len(locator.locs)}), usually from a call to"
   2065             " set_ticks, does not match"
   2066             f" the number of labels ({len(labels)}).")
   2067     tickd = {loc: lab for loc, lab in zip(locator.locs, labels)}
   2068     func = functools.partial(self._format_with_dict, tickd)

ValueError: The number of FixedLocator locations (10), usually from a call to set_ticks, does not match the number of labels (13).
No description has been provided for this image
In [ ]:
# Full time part time may be a predictor for woman's income
#
Cell was canceled due to an error in a previous cell.
In [ ]:
# Analysis of wages
# LFACT == 1
df = df[(df.Wages < 88888888) & (df.LFACT == "Employed - Worked in reference week")]
df = df.reset_index(drop=True)
Cell was canceled due to an error in a previous cell.
In [ ]:
sns.boxplot(data=df, x="Sex", y="Wages")
Cell was canceled due to an error in a previous cell.
In [ ]:
sns.kdeplot(data=df, hue="Sex", x="Wages", bw_adjust=1.7)
Cell was canceled due to an error in a previous cell.
In [ ]:
sns.scatterplot(data=df, x="Wages", y="AGE")
sns.rugplot(data=df, x="Wages", y="AGE")
Cell was canceled due to an error in a previous cell.
In [ ]:
sns.kdeplot(data=df, x="Wages", hue="Sex")
sns.rugplot(data=df, x="Wages", hue="Sex")
Cell was canceled due to an error in a previous cell.
In [ ]:
df[df.Wages < 150000].plot.hexbin(x="Wages", y="AGE", gridsize=20)
Cell was canceled due to an error in a previous cell.
In [ ]:
wages_mobility = pd.pivot_table(
    df, index="Mob5", columns="MOB1", values="Wages", aggfunc="mean"
)
sns.heatmap(wages_mobility, annot=False, cmap="crest", vmin=30000, vmax=100000)
Cell was canceled due to an error in a previous cell.
In [ ]:
print(list(df.columns))
Cell was canceled due to an error in a previous cell.

Regression Models

In [ ]:
# variable recoding

# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html
# https://stackoverflow.com/questions/58101126/using-scikit-learn-onehotencoder-with-a-pandas-dataframe
# # try OneHotEncoder
# oht_encode = OneHotEncoder()
# oht_encode.fit(df['MOB5_cat'].unique().reshape(-1, 1))
# transformed = oht_encode.transform(
#     df['MOB5_cat'].to_numpy().reshape(-1, 1)).toarray()
# ohe_df = pd.DataFrame(transformed, columns=oht_encode.get_feature_names_out())
# df = pd.concat([df, ohe_df], axis=1).drop(['MOB5_cat'], axis=1)
# do the same thing with get dummies
df = pd.get_dummies(df, prefix=["MOB5_cat"], columns=["MOB5_cat"], drop_first=False)
df = pd.get_dummies(df, prefix=["MOB1_cat"], columns=["MOB1_cat"], drop_first=False)
df = pd.get_dummies(df, prefix=["NOCS_cat"], columns=["NOCS_cat"], drop_first=False)
Cell was canceled due to an error in a previous cell.
In [ ]:
# variable selection
# y - Wages
# X
# - MOB1_cat: moved in the last year
# - MOB5_cat: moved in the last 5 years
# "HHSIZE_int", "Sex_cat", "BedRm_cat", "MOB5_cat_moved_to_canada_5", "MOB5_cat_moved_within_canada_5", "MOB5_cat_not_moved_5", "MOB1_cat_moved_to_canada_1", "MOB1_cat_moved_within_canada_1", "MOB1_cat_not_moved_1"

# A1. simple regression model
pred_var = "Wages"
variable_list = [
    "AGE",
    "Sex_cat",
    "BedRm_cat",
    "MOB5_cat_moved_to_canada_5",
    "MOB5_cat_moved_within_canada_5",
    # "MOB5_cat_not_moved_5",
    # "MOB1_cat_moved_to_canada_1",
    # "MOB1_cat_moved_within_canada_1",
    "MOB1_cat_not_moved_1",
    "NOCS_cat_Not",
    "NOCS_cat_art_trades_manuf",
    # "NOCS_cat_busi",
    "NOCS_cat_health_edu_law",
    "NOCS_cat_mgmt_sci",
    "NOCS_cat_sales_agri",
]

X, y = df[variable_list], df[pred_var]

X = sm.add_constant(X)
variable_list.append("const")
model = sm.OLS(y, X).fit()
print(model.summary())
Cell was canceled due to an error in a previous cell.
In [ ]:
# A2. Simple regression model with formula (simple OLS)
# using formula is very convenient for specifying different models
# https://www.statsmodels.org/stable/api.html#statsmodels-formula-api
# print(' + '.join(variable_list))
results = smf.ols(
    "Wages ~ np.log(AGE) + Sex_cat + BedRm_cat + MOB5_cat_moved_to_canada_5 + MOB5_cat_moved_within_canada_5 + MOB1_cat_not_moved_1 + NOCS_cat_Not + NOCS_cat_art_trades_manuf + NOCS_cat_health_edu_law + NOCS_cat_mgmt_sci + NOCS_cat_sales_agri",
    data=df,
).fit()
print(results.summary())
Cell was canceled due to an error in a previous cell.
In [ ]:
# read: https://machinelearningresearch.quora.com/What-is-difference-between-ordinary-linear-regression-and-generalized-linear-model-or-linear-regression

# A3. Generalize Linear
results = smf.gls(
    "Wages ~ np.log(AGE) + Sex_cat + BedRm_cat + MOB5_cat_moved_to_canada_5 + MOB5_cat_moved_within_canada_5 + MOB1_cat_not_moved_1 + NOCS_cat_Not + NOCS_cat_art_trades_manuf + NOCS_cat_health_edu_law + NOCS_cat_mgmt_sci + NOCS_cat_sales_agri",
    data=df,
).fit()
print(results.summary())
Cell was canceled due to an error in a previous cell.

Model options from StatsModel¶

  • OLS: Ordinary Least Square - https://www.statsmodels.org/stable/generated/statsmodels.regression.linear_model.OLS.html
  • WLS: Weighted Least Square - https://www.statsmodels.org/stable/generated/statsmodels.regression.linear_model.WLS.html#statsmodels.regression.linear_model.WLS
  • GLS: Generalized Least Square - https://www.statsmodels.org/stable/generated/statsmodels.regression.linear_model.GLS.html#statsmodels.regression.linear_model.GLS
  • GLSAR: Generalized Least Squares with AR covariance structure - specify covariance structure
  • MixedLM: Linear Mixed Effects - https://www.statsmodels.org/stable/generated/statsmodels.regression.mixed_linear_model.MixedLM.html#statsmodels.regression.mixed_linear_model.MixedLM
  • GEE, Ordinal GEE, Nominal GEE: Estimating equations / structural equations
  • RLM: Robust linear model - https://www.statsmodels.org/stable/generated/statsmodels.robust.robust_linear_model.RLM.html#statsmodels.robust.robust_linear_model.RLM Discrete Choices - https://www.statsmodels.org/stable/examples/notebooks/generated/discrete_choice_overview.html
  • Logit: Logistic regression - https://www.statsmodels.org/stable/generated/statsmodels.discrete.discrete_model.Logit.html#statsmodels.discrete.discrete_model.Logit
  • Probit: Probit regression - https://www.statsmodels.org/stable/generated/statsmodels.discrete.discrete_model.Probit.html#statsmodels.discrete.discrete_model.Probit
  • MNLogit: Multinomial logit regression - https://www.statsmodels.org/stable/generated/statsmodels.discrete.discrete_model.MNLogit.html#statsmodels.discrete.discrete_model.MNLogit
  • Poisson: Poisson discrete choice - https://www.statsmodels.org/stable/generated/statsmodels.discrete.discrete_model.Poisson.html#statsmodels.discrete.discrete_model.Poisson
  • Negative Binomial discrete choice - https://www.statsmodels.org/stable/generated/statsmodels.discrete.discrete_model.NegativeBinomial.html#statsmodels.discrete.discrete_model.NegativeBinomial Misc:
  • Quantile Regression - https://www.statsmodels.org/stable/generated/statsmodels.regression.quantile_regression.QuantReg.html#statsmodels.regression.quantile_regression.QuantReg
  • Hazard Regression - https://www.statsmodels.org/stable/generated/statsmodels.duration.hazard_regression.PHReg.html#statsmodels.duration.hazard_regression.PHReg
  • GLM Additive Model - https://www.statsmodels.org/stable/generated/statsmodels.gam.generalized_additive_model.GLMGam.html#statsmodels.gam.generalized_additive_model.GLMGam feature selection
  • Variance Threshold
In [ ]:
from sklearn.feature_selection import VarianceThreshold

sel = VarianceThreshold(threshold=(0.8 * (1 - 0.8)))
sel.fit(X)
print(sel.get_feature_names_out())
Cell was canceled due to an error in a previous cell.
In [ ]:
import math

# B1: ML Regression
# https://scikit-learn.org/stable/supervised_learning.html#supervised-learning

# OLS Linear Regression using sklearn
reg = linear_model.LinearRegression()
reg.fit(X, y)
y_pred = reg.predict(X)
print(reg.intercept_, reg.coef_, reg.score(X, y))
print(f"r2: {r2_score(y, y_pred)}")
print(f"rmse: {math.sqrt(mean_squared_error(y, y_pred))}")
Cell was canceled due to an error in a previous cell.
In [ ]:
# Ridge Regression
reg = linear_model.Ridge(alpha=0.5)
reg.fit(X, y)
y_pred = reg.predict(X)
print(reg.intercept_, reg.coef_, reg.score(X, y))
print(f"r2: {r2_score(y, y_pred)}")
print(f"rmse: {math.sqrt(mean_squared_error(y, y_pred))}")
Cell was canceled due to an error in a previous cell.
In [ ]:
# Lasso Regression
reg = linear_model.Lasso(alpha=0.1)
reg.fit(X, y)
y_pred = reg.predict(X)
print(reg.intercept_, reg.coef_, reg.score(X, y))
print(f"r2: {r2_score(y, y_pred)}")
print(f"rmse: {math.sqrt(mean_squared_error(y, y_pred))}")
Cell was canceled due to an error in a previous cell.
In [ ]:
# Elastic Net with Cross Grid
reg = linear_model.ElasticNetCV(cv=5, random_state=0)
reg.fit(X, y)
y_pred = reg.predict(X)
print(reg.alpha_)
print(reg.intercept_, reg.coef_, reg.score(X, y))
print(f"r2: {r2_score(y, y_pred)}")
print(f"rmse: {math.sqrt(mean_squared_error(y, y_pred))}")

# # %%
# # Support Vector Machine for Regression
# from sklearn import svm

# reg = svm.SVR()
# reg.fit(X, y)
# y_pred = reg.predict(X)
# print(reg.alpha_)
# print(reg.intercept_, reg.coef_, reg.score(X, y))
# print(f"r2: {r2_score(y, y_pred)}")
# print(f"rmse: {math.sqrt(mean_squared_error(y, y_pred))}")
Cell was canceled due to an error in a previous cell.
In [ ]:
# Regression Tree
# https://scikit-learn.org/stable/auto_examples/tree/plot_tree_regression.html

# normalize data
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.normalize.html
transformer = Normalizer().fit(X)
Xt = transformer.transform(X)

reg = tree.DecisionTreeRegressor(max_depth=6)
reg.fit(Xt, y)
y_pred = reg.predict(Xt)
print(f"r2: {r2_score(y, y_pred)}")
print(f"rmse: {math.sqrt(mean_squared_error(y, y_pred))}")
# tree.plot_tree(reg)
# https://stackoverflow.com/questions/68352933/name-of-variables-in-sklearn-pipeline
r = tree.export_text(reg, feature_names=variable_list)
print(r)
pd.DataFrame({"name": list(X.columns), "value": reg.feature_importances_}).head(20)
Cell was canceled due to an error in a previous cell.
In [ ]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
reg = RandomForestClassifier(n_estimators=5)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
print(f"r2: {r2_score(y_test, y_pred)}")
print(f"rmse: {math.sqrt(mean_squared_error(y_test, y_pred))}")
pd.DataFrame({"name": list(X.columns), "value": reg.feature_importances_}).head(20)
Cell was canceled due to an error in a previous cell.
In [ ]:
# AdaBoost
from sklearn.ensemble import AdaBoostClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
reg = AdaBoostClassifier(n_estimators=5)
scores = cross_val_score(reg, X_train, y_train, cv=5)
print(f"scores: {scores}")
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
print(f"r2: {r2_score(y_test, y_pred)}")
print(f"rmse: {math.sqrt(mean_squared_error(y_test, y_pred))}")
pd.DataFrame({"name": list(X.columns), "value": reg.feature_importances_}).head(20)
Cell was canceled due to an error in a previous cell.
In [ ]:
# Gradient Boosting
from sklearn.ensemble import GradientBoostingRegressor

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
reg = GradientBoostingRegressor(random_state=0)
scores = cross_val_score(reg, X_train, y_train, cv=5)
print(f"scores: {scores}")
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
print(f"r2: {r2_score(y_test, y_pred)}")
print(f"rmse: {math.sqrt(mean_squared_error(y_test, y_pred))}")
pd.DataFrame({"name": list(X.columns), "value": reg.feature_importances_}).head(20)
Cell was canceled due to an error in a previous cell.
In [ ]:
# Histogram-based Gradient Boosting
# https://scikit-learn.org/stable/modules/ensemble.html#histogram-based-gradient-boosting
from sklearn.ensemble import HistGradientBoostingRegressor

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
reg = HistGradientBoostingRegressor(random_state=0)
scores = cross_val_score(reg, X_train, y_train, cv=5)
print(f"scores: {scores}")
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
print(f"r2: {r2_score(y_test, y_pred)}")
print(f"rmse: {math.sqrt(mean_squared_error(y_test, y_pred))}")
# pd.DataFrame({"name": list(X.columns), "value": reg.feature_importances_}).head(20)
Cell was canceled due to an error in a previous cell.
In [ ]:
# # Artificial Neural Network
# from sklearn.neural_network import MLPClassifier

# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# scaler = StandardScaler()
# scaler.fit(X_train)
# X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)

# reg = MLPClassifier(
#     solver="lbfgs", alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1
# )
# scores = cross_val_score(reg, X_train, y_train, cv=5)
# print(f"scores: {scores}")
# reg.fit(X_train, y_train)
# y_pred = reg.predict(X_test)
# print(f"r2: {r2_score(y_test, y_pred)}")
# print(f"rmse: {math.sqrt(mean_squared_error(y_test, y_pred))}")

# # ~25 mins
Cell was canceled due to an error in a previous cell.

Classification Models

In [ ]:
# C1: ML Classification

pred_var = "BedRm"
variable_list = [
    "AGE",
    "Wages",
    "Sex_cat",
    "MOB5_cat_moved_to_canada_5",
    "MOB5_cat_moved_within_canada_5",
    # "MOB5_cat_not_moved_5",
    # "MOB1_cat_moved_to_canada_1",
    # "MOB1_cat_moved_within_canada_1",
    "MOB1_cat_not_moved_1",
    "NOCS_cat_Not",
    "NOCS_cat_art_trades_manuf",
    # "NOCS_cat_busi",
    "NOCS_cat_health_edu_law",
    "NOCS_cat_mgmt_sci",
    "NOCS_cat_sales_agri",
]

X, y = df[variable_list], df[pred_var]
y_classes = list(df[pred_var].unique())
Cell was canceled due to an error in a previous cell.
In [ ]:
# Classification Tree
# https://scikit-learn.org/stable/auto_examples/tree/plot_tree_regression.html

# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# normalize data
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.normalize.html
transformer = Normalizer().fit(X)
Xt = transformer.transform(X)

reg = tree.DecisionTreeClassifier(max_depth=6)
reg.fit(Xt, y)
y_pred = reg.predict(Xt)
print(f"score: {accuracy_score(y, y_pred)}")
print(classification_report(y, y_pred, target_names=y_classes))
# tree.plot_tree(reg)
# https://stackoverflow.com/questions/68352933/name-of-variables-in-sklearn-pipeline
r = tree.export_text(reg, feature_names=variable_list)
print(r)
pd.DataFrame({"name": list(X.columns), "value": reg.feature_importances_}).head(20)

# notes:
# Precision - TP / (TP + FP): fraction of true that is actually true
# Recall - TP / (TP + FN): fraction of true that was originally true
# f score - composite (avg) of precision and recall, 1 is perfect
Cell was canceled due to an error in a previous cell.

Clustering Models

In [ ]:
variable_list = [
    "AGE",
    "Wages",
    "Sex_cat",
    "MOB5_cat_moved_to_canada_5",
    "MOB5_cat_moved_within_canada_5",
    # "MOB5_cat_not_moved_5",
    # "MOB1_cat_moved_to_canada_1",
    # "MOB1_cat_moved_within_canada_1",
    "MOB1_cat_not_moved_1",
    "NOCS_cat_Not",
    "NOCS_cat_art_trades_manuf",
    # "NOCS_cat_busi",
    "NOCS_cat_health_edu_law",
    "NOCS_cat_mgmt_sci",
    "NOCS_cat_sales_agri",
]

X = df[variable_list]
Cell was canceled due to an error in a previous cell.
In [ ]:
# D1: ML Clustering

scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
kmeans = KMeans(n_clusters=6, random_state=0).fit(X)
kmeans.labels_
y = kmeans.predict(X)
cc = kmeans.cluster_centers_
# dict(zip(list(range(0,len(cc))), cc))
cc_dict = {("cluster" + str(i)): cc[i] for i in range(0, len(cc))}
view_dict = {"name": variable_list}
view_dict.update(cc_dict)
pd.DataFrame(view_dict)
Cell was canceled due to an error in a previous cell.
In [ ]:
df["cluster"] = y
df.groupby("cluster").sum()["WEIGHT"].round(0) / df.sum()["WEIGHT"] * 100
Cell was canceled due to an error in a previous cell.
In [ ]:
# sns.scatterplot(data=df.sample(100), x="Wages", y="AGE", hue="cluster", size=10)

sns.displot(
    data=df.sample(100), x="Wages", y="AGE", hue="cluster", kind="kde", rug=True
)

# sns.displot(
#     data=df.sample(100), x="Wages", y="AGE", hue="cluster", kind="hist", rug=True
# )

pivot_df = df.pivot_table(index="cluster", columns="DPGRSUM", aggfunc="sum")["WEIGHT"]
pivot_df / pivot_df.sum() * 100
Cell was canceled due to an error in a previous cell.